# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import folium
from folium.plugins import HeatMap
import plotly.express as px
plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)
# reading data
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or config value.
df = pd.read_csv("C:/Users/VANAM GANESH/Downloads/hotel_bookings.csv/hotel_bookings.csv")
df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
# summary statistics for the numeric columns (note the adr outliers:
# min -6.38 and max 5400 per the output below)
df.describe()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119386.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 103050.000000 | 6797.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 |
| mean | 0.370416 | 104.011416 | 2016.156554 | 27.165173 | 15.798241 | 0.927599 | 2.500302 | 1.856403 | 0.103890 | 0.007949 | 0.031912 | 0.087118 | 0.137097 | 0.221124 | 86.693382 | 189.266735 | 2.321149 | 101.831122 | 0.062518 | 0.571363 |
| std | 0.482918 | 106.863097 | 0.707476 | 13.605138 | 8.780829 | 0.998613 | 1.908286 | 0.579261 | 0.398561 | 0.097436 | 0.175767 | 0.844336 | 1.497437 | 0.652306 | 110.774548 | 131.655015 | 17.594721 | 50.535790 | 0.245291 | 0.792798 |
| min | 0.000000 | 0.000000 | 2015.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 6.000000 | 0.000000 | -6.380000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 18.000000 | 2016.000000 | 16.000000 | 8.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 62.000000 | 0.000000 | 69.290000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 69.000000 | 2016.000000 | 28.000000 | 16.000000 | 1.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 179.000000 | 0.000000 | 94.575000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 160.000000 | 2017.000000 | 38.000000 | 23.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 229.000000 | 270.000000 | 0.000000 | 126.000000 | 0.000000 | 1.000000 |
| max | 1.000000 | 737.000000 | 2017.000000 | 53.000000 | 31.000000 | 19.000000 | 50.000000 | 55.000000 | 10.000000 | 10.000000 | 1.000000 | 26.000000 | 72.000000 | 21.000000 | 535.000000 | 543.000000 | 391.000000 | 5400.000000 | 8.000000 | 5.000000 |
# column dtypes and non-null counts (children, country, agent, company have nulls)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
# checking for null values
# isna().mean() is the fraction of missing rows per column, so * 100 gives
# the same percentage as sum() / len(df) * 100
null = pd.DataFrame({
    'Null Values': df.isna().sum(),
    'Percentage Null Values': df.isna().mean() * 100,
})
null
| Null Values | Percentage Null Values | |
|---|---|---|
| hotel | 0 | 0.000000 |
| is_canceled | 0 | 0.000000 |
| lead_time | 0 | 0.000000 |
| arrival_date_year | 0 | 0.000000 |
| arrival_date_month | 0 | 0.000000 |
| arrival_date_week_number | 0 | 0.000000 |
| arrival_date_day_of_month | 0 | 0.000000 |
| stays_in_weekend_nights | 0 | 0.000000 |
| stays_in_week_nights | 0 | 0.000000 |
| adults | 0 | 0.000000 |
| children | 4 | 0.003350 |
| babies | 0 | 0.000000 |
| meal | 0 | 0.000000 |
| country | 488 | 0.408744 |
| market_segment | 0 | 0.000000 |
| distribution_channel | 0 | 0.000000 |
| is_repeated_guest | 0 | 0.000000 |
| previous_cancellations | 0 | 0.000000 |
| previous_bookings_not_canceled | 0 | 0.000000 |
| reserved_room_type | 0 | 0.000000 |
| assigned_room_type | 0 | 0.000000 |
| booking_changes | 0 | 0.000000 |
| deposit_type | 0 | 0.000000 |
| agent | 16340 | 13.686238 |
| company | 112593 | 94.306893 |
| days_in_waiting_list | 0 | 0.000000 |
| customer_type | 0 | 0.000000 |
| adr | 0 | 0.000000 |
| required_car_parking_spaces | 0 | 0.000000 |
| total_of_special_requests | 0 | 0.000000 |
| reservation_status | 0 | 0.000000 |
| reservation_status_date | 0 | 0.000000 |
# filling null values with zero
# NOTE(review): this also fills the object column 'country' with the integer 0
# — a sentinel like 'Unknown' may be more appropriate; confirm downstream use.
df.fillna(0, inplace = True)
# visualizing null values
msno.bar(df)
plt.show()
# adults, babies and children can't all be zero at the same time, so dropping the rows having all these zero at the same time
# NOTE(review): 'filter' shadows the builtin of the same name; kept as-is
# because a later cell reads this variable.
filter = (df.children == 0) & (df.adults == 0) & (df.babies == 0)
df[filter]
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2224 | Resort Hotel | 0 | 1 | 2015 | October | 41 | 6 | 0 | 3 | 0 | 0.0 | 0 | SC | PRT | Corporate | Corporate | 0 | 0 | 0 | A | I | 1 | No Deposit | 0.0 | 174.0 | 0 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 2015-10-06 |
| 2409 | Resort Hotel | 0 | 0 | 2015 | October | 42 | 12 | 0 | 0 | 0 | 0.0 | 0 | SC | PRT | Corporate | Corporate | 0 | 0 | 0 | A | I | 0 | No Deposit | 0.0 | 174.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-10-12 |
| 3181 | Resort Hotel | 0 | 36 | 2015 | November | 47 | 20 | 1 | 2 | 0 | 0.0 | 0 | SC | ESP | Groups | TA/TO | 0 | 0 | 0 | A | C | 0 | No Deposit | 38.0 | 0.0 | 0 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 2015-11-23 |
| 3684 | Resort Hotel | 0 | 165 | 2015 | December | 53 | 30 | 1 | 4 | 0 | 0.0 | 0 | SC | PRT | Groups | TA/TO | 0 | 0 | 0 | A | A | 1 | No Deposit | 308.0 | 0.0 | 122 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 2016-01-04 |
| 3708 | Resort Hotel | 0 | 165 | 2015 | December | 53 | 30 | 2 | 4 | 0 | 0.0 | 0 | SC | PRT | Groups | TA/TO | 0 | 0 | 0 | A | C | 1 | No Deposit | 308.0 | 0.0 | 122 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 2016-01-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 115029 | City Hotel | 0 | 107 | 2017 | June | 26 | 27 | 0 | 3 | 0 | 0.0 | 0 | BB | CHE | Online TA | TA/TO | 0 | 0 | 0 | A | A | 1 | No Deposit | 7.0 | 0.0 | 0 | Transient | 100.80 | 0 | 0 | Check-Out | 2017-06-30 |
| 115091 | City Hotel | 0 | 1 | 2017 | June | 26 | 30 | 0 | 1 | 0 | 0.0 | 0 | SC | PRT | Complementary | Direct | 0 | 0 | 0 | E | K | 0 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.00 | 1 | 1 | Check-Out | 2017-07-01 |
| 116251 | City Hotel | 0 | 44 | 2017 | July | 28 | 15 | 1 | 1 | 0 | 0.0 | 0 | SC | SWE | Online TA | TA/TO | 0 | 0 | 0 | A | K | 2 | No Deposit | 425.0 | 0.0 | 0 | Transient | 73.80 | 0 | 0 | Check-Out | 2017-07-17 |
| 116534 | City Hotel | 0 | 2 | 2017 | July | 28 | 15 | 2 | 5 | 0 | 0.0 | 0 | SC | RUS | Online TA | TA/TO | 0 | 0 | 0 | A | K | 1 | No Deposit | 9.0 | 0.0 | 0 | Transient-Party | 22.86 | 0 | 1 | Check-Out | 2017-07-22 |
| 117087 | City Hotel | 0 | 170 | 2017 | July | 30 | 27 | 0 | 2 | 0 | 0.0 | 0 | BB | BRA | Offline TA/TO | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 52.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2017-07-29 |
180 rows × 32 columns
# keep only rows where at least one of adults / children / babies is non-zero
df = df[~filter]
df
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.00 | 0 | 1 | Check-Out | 2015-07-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | City Hotel | 0 | 23 | 2017 | August | 35 | 30 | 2 | 5 | 2 | 0.0 | 0 | BB | BEL | Offline TA/TO | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 394.0 | 0.0 | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 |
| 119386 | City Hotel | 0 | 102 | 2017 | August | 35 | 31 | 2 | 5 | 3 | 0.0 | 0 | BB | FRA | Online TA | TA/TO | 0 | 0 | 0 | E | E | 0 | No Deposit | 9.0 | 0.0 | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 |
| 119387 | City Hotel | 0 | 34 | 2017 | August | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | BB | DEU | Online TA | TA/TO | 0 | 0 | 0 | D | D | 0 | No Deposit | 9.0 | 0.0 | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 |
| 119388 | City Hotel | 0 | 109 | 2017 | August | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 89.0 | 0.0 | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 |
| 119389 | City Hotel | 0 | 205 | 2017 | August | 35 | 29 | 2 | 7 | 2 | 0.0 | 0 | HB | DEU | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 9.0 | 0.0 | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 |
119210 rows × 32 columns
# number of guests per home country, counting only non-cancelled bookings
not_cancelled = df[df['is_canceled'] == 0]
country_wise_guests = not_cancelled['country'].value_counts().reset_index()
country_wise_guests.columns = ['country', 'No of guests']
country_wise_guests
| country | No of guests | |
|---|---|---|
| 0 | PRT | 20977 |
| 1 | GBR | 9668 |
| 2 | FRA | 8468 |
| 3 | ESP | 6383 |
| 4 | DEU | 6067 |
| ... | ... | ... |
| 161 | BHR | 1 |
| 162 | DJI | 1 |
| 163 | MLI | 1 |
| 164 | NPL | 1 |
| 165 | FRO | 1 |
166 rows × 2 columns
# world choropleth of guest counts by home country
# NOTE(review): 'basemap' is created but never used in this cell — looks like a
# leftover from a folium-based version; confirm before removing.
basemap = folium.Map()
guests_map = px.choropleth(country_wise_guests, locations = country_wise_guests['country'],
color = country_wise_guests['No of guests'], hover_name = country_wise_guests['country'])
guests_map.show()
df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
# ADR distribution per reserved room type, confirmed bookings only
data = df[df['is_canceled'] == 0]
px.box(data_frame = data, x = 'reserved_room_type', y = 'adr', color = 'hotel', template = 'plotly_dark')
data_resort = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
data_city = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)]
# mean nightly price (adr) per arrival month for the resort hotel;
# as_index=False is equivalent to .mean().reset_index() for a single key
resort_hotel = data_resort.groupby('arrival_date_month', as_index = False)['adr'].mean()
resort_hotel
| arrival_date_month | adr | |
|---|---|---|
| 0 | April | 75.867816 |
| 1 | August | 181.205892 |
| 2 | December | 68.410104 |
| 3 | February | 54.147478 |
| 4 | January | 48.761125 |
| 5 | July | 150.122528 |
| 6 | June | 107.974850 |
| 7 | March | 57.056838 |
| 8 | May | 76.657558 |
| 9 | November | 48.706289 |
| 10 | October | 61.775449 |
| 11 | September | 96.416860 |
# mean nightly price (adr) per arrival month for the city hotel;
# as_index=False is equivalent to .mean().reset_index() for a single key
city_hotel = data_city.groupby('arrival_date_month', as_index = False)['adr'].mean()
city_hotel
| arrival_date_month | adr | |
|---|---|---|
| 0 | April | 111.962267 |
| 1 | August | 118.674598 |
| 2 | December | 88.401855 |
| 3 | February | 86.520062 |
| 4 | January | 82.330983 |
| 5 | July | 115.818019 |
| 6 | June | 117.874360 |
| 7 | March | 90.658533 |
| 8 | May | 120.669827 |
| 9 | November | 86.946592 |
| 10 | October | 102.004672 |
| 11 | September | 112.776582 |
# combine the monthly prices of both hotels into one frame; both inputs carry
# an 'adr' column, so merge applies its default '_x' / '_y' suffixes
final_hotel = (resort_hotel
               .merge(city_hotel, on = 'arrival_date_month')
               .rename(columns = {'arrival_date_month': 'month',
                                  'adr_x': 'price_for_resort',
                                  'adr_y': 'price_for_city_hotel'}))
final_hotel
| month | price_for_resort | price_for_city_hotel | |
|---|---|---|---|
| 0 | April | 75.867816 | 111.962267 |
| 1 | August | 181.205892 | 118.674598 |
| 2 | December | 68.410104 | 88.401855 |
| 3 | February | 54.147478 | 86.520062 |
| 4 | January | 48.761125 | 82.330983 |
| 5 | July | 150.122528 | 115.818019 |
| 6 | June | 107.974850 | 117.874360 |
| 7 | March | 57.056838 | 90.658533 |
| 8 | May | 76.657558 | 120.669827 |
| 9 | November | 48.706289 | 86.946592 |
| 10 | October | 61.775449 | 102.004672 |
| 11 | September | 96.416860 | 112.776582 |
!pip install sort-dataframeby-monthorweek
!pip install sorted-months-weekdays
Requirement already satisfied: sort-dataframeby-monthorweek in c:\users\vanam ganesh\anaconda3\lib\site-packages (0.4) Requirement already satisfied: sorted-months-weekdays in c:\users\vanam ganesh\anaconda3\lib\site-packages (0.2)
import sort_dataframeby_monthorweek as sd
def sort_month(df, column_name):
    """Return *df* with its rows ordered by calendar month in *column_name*."""
    # delegate to sort_dataframeby_monthorweek, which knows month ordering
    month_ordered = sd.Sort_Dataframeby_Month(df, column_name)
    return month_ordered
# reorder the rows chronologically (January .. December)
final_prices = sort_month(final_hotel, 'month')
final_prices
| month | price_for_resort | price_for_city_hotel | |
|---|---|---|---|
| 0 | January | 48.761125 | 82.330983 |
| 1 | February | 54.147478 | 86.520062 |
| 2 | March | 57.056838 | 90.658533 |
| 3 | April | 75.867816 | 111.962267 |
| 4 | May | 76.657558 | 120.669827 |
| 5 | June | 107.974850 | 117.874360 |
| 6 | July | 150.122528 | 115.818019 |
| 7 | August | 181.205892 | 118.674598 |
| 8 | September | 96.416860 | 112.776582 |
| 9 | October | 61.775449 | 102.004672 |
| 10 | November | 48.706289 | 86.946592 |
| 11 | December | 68.410104 | 88.401855 |
# Room price per night over the months.
# The original cell called plt.figure() before px.line, but px.line is a
# Plotly chart — the matplotlib figure was never drawn into and only produced
# the stray "<Figure size 1700x800 with 0 Axes>" artifact, so it is removed.
px.line(final_prices, x = 'month', y = ['price_for_resort','price_for_city_hotel'],
        title = 'Room price per night over the Months', template = 'plotly_dark')
<Figure size 1700x800 with 0 Axes>
# confirmed resort-hotel arrivals per month (descending by count)
resort_month_counts = data_resort['arrival_date_month'].value_counts()
resort_guests = resort_month_counts.reset_index()
resort_guests.columns = ['month', 'no of guests']
resort_guests
| month | no of guests | |
|---|---|---|
| 0 | August | 3257 |
| 1 | July | 3137 |
| 2 | October | 2575 |
| 3 | March | 2571 |
| 4 | April | 2550 |
| 5 | May | 2535 |
| 6 | February | 2308 |
| 7 | September | 2102 |
| 8 | June | 2037 |
| 9 | December | 2014 |
| 10 | November | 1975 |
| 11 | January | 1866 |
# confirmed city-hotel arrivals per month (descending by count)
city_month_counts = data_city['arrival_date_month'].value_counts()
city_guests = city_month_counts.reset_index()
city_guests.columns = ['month', 'no of guests']
city_guests
| month | no of guests | |
|---|---|---|
| 0 | August | 5367 |
| 1 | July | 4770 |
| 2 | May | 4568 |
| 3 | June | 4358 |
| 4 | October | 4326 |
| 5 | September | 4283 |
| 6 | March | 4049 |
| 7 | April | 4010 |
| 8 | February | 3051 |
| 9 | November | 2676 |
| 10 | December | 2377 |
| 11 | January | 2249 |
# side-by-side monthly guest counts for both hotels; the inputs share the
# column name 'no of guests', so merge applies its default '_x' / '_y' suffixes
final_guests = (resort_guests
                .merge(city_guests, on = 'month')
                .rename(columns = {'no of guests_x': 'no of guests in resort',
                                   'no of guests_y': 'no of guest in city hotel'}))
final_guests
| month | no of guests in resort | no of guest in city hotel | |
|---|---|---|---|
| 0 | August | 3257 | 5367 |
| 1 | July | 3137 | 4770 |
| 2 | October | 2575 | 4326 |
| 3 | March | 2571 | 4049 |
| 4 | April | 2550 | 4010 |
| 5 | May | 2535 | 4568 |
| 6 | February | 2308 | 3051 |
| 7 | September | 2102 | 4283 |
| 8 | June | 2037 | 4358 |
| 9 | December | 2014 | 2377 |
| 10 | November | 1975 | 2676 |
| 11 | January | 1866 | 2249 |
# put the rows in calendar-month order
final_guests = sort_month(final_guests,'month')
final_guests
| month | no of guests in resort | no of guest in city hotel | |
|---|---|---|---|
| 0 | January | 1866 | 2249 |
| 1 | February | 2308 | 3051 |
| 2 | March | 2571 | 4049 |
| 3 | April | 2550 | 4010 |
| 4 | May | 2535 | 4568 |
| 5 | June | 2037 | 4358 |
| 6 | July | 3137 | 4770 |
| 7 | August | 3257 | 5367 |
| 8 | September | 2102 | 4283 |
| 9 | October | 2575 | 4326 |
| 10 | November | 1975 | 2676 |
| 11 | December | 2014 | 2377 |
# monthly guest totals for both hotels on one line chart
px.line(final_guests, x = 'month', y = ['no of guests in resort','no of guest in city hotel'],
title='Total no of guests per Months', template = 'plotly_dark')
# keep only confirmed (not cancelled) bookings for the stay-length analysis
# NOTE(review): 'filter' shadows the builtin again; kept for consistency with
# the earlier cells.
filter = df['is_canceled'] == 0
data = df[filter]
data.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
# total length of stay = weekend nights + week nights.
# Work on an explicit copy: 'data' is a boolean-filtered slice of df, and
# assigning a column to it triggers pandas' SettingWithCopyWarning (and is
# undefined behaviour under copy-on-write). The copy makes the intent explicit
# without changing the resulting frame.
data = data.copy()
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
data.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | ... | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | total_nights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | ... | 0 | 0 | C | C | 3 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 | 0 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | ... | 0 | 0 | C | C | 4 | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 | 0 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | ... | 0 | 0 | A | C | 0 | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 | 1 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | ... | 0 | 0 | A | A | 0 | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 | 1 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | ... | 0 | 0 | A | A | 0 | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 | 2 |
5 rows × 33 columns
# number of stays per total_nights value, split by hotel type. agg('count')
# counts rows in every column; .iloc[:, :3] keeps the two group keys plus the
# first counted column ('is_canceled'), which is then renamed.
stay = (data.groupby(['total_nights', 'hotel'])
            .agg('count')
            .reset_index()
            .iloc[:, :3]
            .rename(columns = {'is_canceled': 'Number of stays'}))
stay
| total_nights | hotel | Number of stays | |
|---|---|---|---|
| 0 | 0 | City Hotel | 251 |
| 1 | 0 | Resort Hotel | 371 |
| 2 | 1 | City Hotel | 9155 |
| 3 | 1 | Resort Hotel | 6579 |
| 4 | 2 | City Hotel | 10983 |
| ... | ... | ... | ... |
| 57 | 46 | Resort Hotel | 1 |
| 58 | 48 | City Hotel | 1 |
| 59 | 56 | Resort Hotel | 1 |
| 60 | 60 | Resort Hotel | 1 |
| 61 | 69 | Resort Hotel | 1 |
62 rows × 3 columns
# Grouped bar chart: how long guests stay, split by hotel type.
px.bar(
    data_frame = stay,
    x = 'total_nights',
    y = 'Number of stays',
    color = 'hotel',
    barmode = 'group',
    template = 'plotly_dark',
)
# Pairwise correlations of the numeric columns, rendered as an annotated heatmap.
plt.figure(figsize = (24, 12))
corr = df.corr()  # kept as a module-level name in case later cells reuse it
sns.heatmap(corr, linewidths = 1, annot = True)
plt.show()
# Absolute linear correlation of every numeric feature with the target,
# strongest first — a quick feature-importance proxy.
correlation = (
    df.corr()['is_canceled']
    .abs()
    .sort_values(ascending = False)
)
correlation
is_canceled 1.000000 lead_time 0.292876 total_of_special_requests 0.234877 required_car_parking_spaces 0.195701 booking_changes 0.144832 previous_cancellations 0.110139 is_repeated_guest 0.083745 company 0.083594 adults 0.058182 previous_bookings_not_canceled 0.057365 days_in_waiting_list 0.054301 agent 0.046770 adr 0.046492 babies 0.032569 stays_in_week_nights 0.025542 arrival_date_year 0.016622 arrival_date_week_number 0.008315 arrival_date_day_of_month 0.005948 children 0.004851 stays_in_weekend_nights 0.001323 Name: is_canceled, dtype: float64
# dropping columns that are not useful
useless_col = ['days_in_waiting_list', 'arrival_date_year', 'arrival_date_year', 'assigned_room_type', 'booking_changes',
'reservation_status', 'country', 'days_in_waiting_list']
df.drop(useless_col, axis = 1, inplace = True)
df.head()
| hotel | is_canceled | lead_time | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | deposit_type | agent | company | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | Direct | Direct | 0 | 0 | 0 | C | No Deposit | 0.0 | 0.0 | Transient | 0.0 | 0 | 0 | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | Direct | Direct | 0 | 0 | 0 | C | No Deposit | 0.0 | 0.0 | Transient | 0.0 | 0 | 0 | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | Direct | Direct | 0 | 0 | 0 | A | No Deposit | 0.0 | 0.0 | Transient | 75.0 | 0 | 0 | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | Corporate | Corporate | 0 | 0 | 0 | A | No Deposit | 304.0 | 0.0 | Transient | 75.0 | 0 | 0 | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | Online TA | TA/TO | 0 | 0 | 0 | A | No Deposit | 240.0 | 0.0 | Transient | 98.0 | 0 | 1 | 2015-07-03 |
# creating numerical and categorical dataframes:
# collect the names of all object-dtype (string/categorical) columns
cat_cols = list(df.select_dtypes(include = 'object').columns)
cat_cols
['hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type', 'reservation_status_date']
# Take an explicit copy: the feature engineering below (datetime expansion,
# label encoding) assigns into cat_df in place, and doing that on a slice of
# `df` triggers pandas' SettingWithCopyWarning / chained-assignment pitfalls.
cat_df = df[cat_cols].copy()
cat_df.head()
| hotel | arrival_date_month | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | July | BB | Direct | Direct | C | No Deposit | Transient | 2015-07-01 |
| 1 | Resort Hotel | July | BB | Direct | Direct | C | No Deposit | Transient | 2015-07-01 |
| 2 | Resort Hotel | July | BB | Direct | Direct | A | No Deposit | Transient | 2015-07-02 |
| 3 | Resort Hotel | July | BB | Corporate | Corporate | A | No Deposit | Transient | 2015-07-02 |
| 4 | Resort Hotel | July | BB | Online TA | TA/TO | A | No Deposit | Transient | 2015-07-03 |
# Expand the reservation-status date into year/month/day features, then drop
# the raw date column plus the (now redundant) arrival month string.
status_dates = pd.to_datetime(cat_df['reservation_status_date'])
cat_df['reservation_status_date'] = status_dates
cat_df['year'] = status_dates.dt.year
cat_df['month'] = status_dates.dt.month
cat_df['day'] = status_dates.dt.day
cat_df.drop(['reservation_status_date', 'arrival_date_month'], axis = 1, inplace = True)
cat_df.head()
| hotel | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | BB | Direct | Direct | C | No Deposit | Transient | 2015 | 7 | 1 |
| 1 | Resort Hotel | BB | Direct | Direct | C | No Deposit | Transient | 2015 | 7 | 1 |
| 2 | Resort Hotel | BB | Direct | Direct | A | No Deposit | Transient | 2015 | 7 | 2 |
| 3 | Resort Hotel | BB | Corporate | Corporate | A | No Deposit | Transient | 2015 | 7 | 2 |
| 4 | Resort Hotel | BB | Online TA | TA/TO | A | No Deposit | Transient | 2015 | 7 | 3 |
# printing unique values of each column (sanity check before encoding)
for column in cat_df.columns:
    distinct = cat_df[column].unique()
    print(f"{column}: \n{distinct}\n")
hotel: ['Resort Hotel' 'City Hotel'] meal: ['BB' 'FB' 'HB' 'SC' 'Undefined'] market_segment: ['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups' 'Undefined' 'Aviation'] distribution_channel: ['Direct' 'Corporate' 'TA/TO' 'Undefined' 'GDS'] reserved_room_type: ['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L' 'B'] deposit_type: ['No Deposit' 'Refundable' 'Non Refund'] customer_type: ['Transient' 'Contract' 'Transient-Party' 'Group'] year: [2015 2014 2016 2017] month: [ 7 5 4 6 3 8 9 1 11 10 12 2] day: [ 1 2 3 6 22 23 5 7 8 11 15 16 29 19 18 9 13 4 12 26 17 10 20 14 30 28 25 21 27 24 31]
# encoding categorical variables
cat_df['hotel'] = cat_df['hotel'].map({'Resort Hotel' : 0, 'City Hotel' : 1})
cat_df['meal'] = cat_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4})
cat_df['market_segment'] = cat_df['market_segment'].map({'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7})
cat_df['distribution_channel'] = cat_df['distribution_channel'].map({'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
'GDS': 4})
cat_df['reserved_room_type'] = cat_df['reserved_room_type'].map({'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
'L': 7, 'B': 8})
cat_df['deposit_type'] = cat_df['deposit_type'].map({'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3})
cat_df['customer_type'] = cat_df['customer_type'].map({'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3})
cat_df['year'] = cat_df['year'].map({2015: 0, 2014: 1, 2016: 2, 2017: 3})
cat_df.head()
| hotel | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 |
| 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | 2 |
| 3 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 7 | 2 |
| 4 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 7 | 3 |
# Numeric feature frame: everything except the categorical columns and the
# target itself (dropped together in one call).
num_df = df.drop(columns = cat_cols + ['is_canceled'])
num_df
| lead_time | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | agent | company | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 342 | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00 | 0 | 0 |
| 1 | 737 | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00 | 0 | 0 |
| 2 | 7 | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 75.00 | 0 | 0 |
| 3 | 13 | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | 0 | 0 | 0 | 304.0 | 0.0 | 75.00 | 0 | 0 |
| 4 | 14 | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | 0 | 0 | 0 | 240.0 | 0.0 | 98.00 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | 23 | 35 | 30 | 2 | 5 | 2 | 0.0 | 0 | 0 | 0 | 0 | 394.0 | 0.0 | 96.14 | 0 | 0 |
| 119386 | 102 | 35 | 31 | 2 | 5 | 3 | 0.0 | 0 | 0 | 0 | 0 | 9.0 | 0.0 | 225.43 | 0 | 2 |
| 119387 | 34 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | 0 | 0 | 0 | 9.0 | 0.0 | 157.71 | 0 | 4 |
| 119388 | 109 | 35 | 31 | 2 | 5 | 2 | 0.0 | 0 | 0 | 0 | 0 | 89.0 | 0.0 | 104.40 | 0 | 0 |
| 119389 | 205 | 35 | 29 | 2 | 7 | 2 | 0.0 | 0 | 0 | 0 | 0 | 9.0 | 0.0 | 151.20 | 0 | 2 |
119210 rows × 16 columns
# Inspect per-column variances before normalization (compare with the values
# after the log transforms below).
num_df.var()
lead_time 11422.361808 arrival_date_week_number 184.990111 arrival_date_day_of_month 77.107192 stays_in_weekend_nights 0.990258 stays_in_week_nights 3.599010 adults 0.330838 children 0.159070 babies 0.009508 is_repeated_guest 0.030507 previous_cancellations 0.713887 previous_bookings_not_canceled 2.244415 agent 11485.169679 company 2897.684308 adr 2543.589039 required_car_parking_spaces 0.060201 total_of_special_requests 0.628652 dtype: float64
# normalizing numerical variables: log(x + 1) compresses the heavy right
# tails of the wide-ranged columns (see variances printed above)
for skewed in ('lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
               'agent', 'company', 'adr'):
    num_df[skewed] = np.log(num_df[skewed] + 1)
num_df.var()
lead_time 2.582757 arrival_date_week_number 0.440884 arrival_date_day_of_month 0.506325 stays_in_weekend_nights 0.990258 stays_in_week_nights 3.599010 adults 0.330838 children 0.159070 babies 0.009508 is_repeated_guest 0.030507 previous_cancellations 0.713887 previous_bookings_not_canceled 2.244415 agent 3.535793 company 1.346883 adr 0.515480 required_car_parking_spaces 0.060201 total_of_special_requests 0.628652 dtype: float64
# Impute missing (log-scaled) adr values with the column mean.
adr_mean = num_df['adr'].mean()
num_df['adr'] = num_df['adr'].fillna(adr_mean)
num_df.head()
| lead_time | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | agent | company | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.837730 | 3.332205 | 0.693147 | 0 | 0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0.000000 | 0.0 | 0.000000 | 0 | 0 |
| 1 | 6.603944 | 3.332205 | 0.693147 | 0 | 0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 0.000000 | 0.0 | 0.000000 | 0 | 0 |
| 2 | 2.079442 | 3.332205 | 0.693147 | 0 | 1 | 1 | 0.0 | 0 | 0 | 0 | 0 | 0.000000 | 0.0 | 4.330733 | 0 | 0 |
| 3 | 2.639057 | 3.332205 | 0.693147 | 0 | 1 | 1 | 0.0 | 0 | 0 | 0 | 0 | 5.720312 | 0.0 | 4.330733 | 0 | 0 |
| 4 | 2.708050 | 3.332205 | 0.693147 | 0 | 2 | 2 | 0.0 | 0 | 0 | 0 | 0 | 5.484797 | 0.0 | 4.595120 | 0 | 1 |
# Final design matrix: encoded categoricals side by side with the normalized
# numericals; target is the untouched cancellation flag.
X = pd.concat([cat_df, num_df], axis = 1)
y = df['is_canceled']
X.shape, y.shape
((119210, 26), (119210,))
# splitting data into training set and test set.
# random_state pins the split so the model comparison below is reproducible
# across notebook reruns (previously every run produced a different split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
X_train.head()
| hotel | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | year | month | day | lead_time | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | agent | company | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 81904 | 1 | 0 | 3 | 2 | 1 | 3 | 0 | 0 | 12 | 18 | 5.204007 | 2.995732 | 1.791759 | 2 | 4 | 2 | 0.0 | 0 | 0 | 1 | 0 | 3.295837 | 0.0 | 4.510860 | 0 | 0 |
| 66526 | 1 | 0 | 2 | 2 | 1 | 0 | 0 | 3 | 1 | 26 | 4.521789 | 2.833213 | 3.091042 | 1 | 2 | 2 | 0.0 | 0 | 0 | 0 | 0 | 2.302585 | 0.0 | 4.844187 | 0 | 0 |
| 95207 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 8 | 16 | 2.564949 | 3.555348 | 2.772589 | 1 | 0 | 2 | 0.0 | 0 | 0 | 0 | 0 | 2.708050 | 0.0 | 5.075174 | 0 | 0 |
| 8043 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 2 | 3 | 8 | 5.752573 | 3.637586 | 2.079442 | 2 | 7 | 2 | 0.0 | 0 | 0 | 0 | 0 | 5.484797 | 0.0 | 4.510860 | 0 | 1 |
| 9332 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 2 | 7 | 7 | 4.941642 | 3.850148 | 2.564949 | 2 | 5 | 2 | 0.0 | 0 | 0 | 0 | 0 | 5.484797 | 0.0 | 3.867026 | 0 | 0 |
# peek at the held-out feature rows
X_test.head()
| hotel | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | year | month | day | lead_time | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | agent | company | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20042 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 15 | 0.000000 | 1.386294 | 2.708050 | 0 | 1 | 1 | 0.0 | 0 | 0 | 0 | 0 | 5.484797 | 0.0 | 3.713572 | 0 | 0 |
| 24451 | 0 | 0 | 3 | 2 | 1 | 0 | 1 | 2 | 5 | 22 | 4.770685 | 3.091042 | 2.772589 | 2 | 5 | 2 | 0.0 | 0 | 0 | 0 | 0 | 5.497168 | 0.0 | 3.648057 | 0 | 0 |
| 113817 | 1 | 0 | 2 | 2 | 1 | 0 | 0 | 3 | 6 | 12 | 3.555348 | 3.178054 | 2.197225 | 1 | 3 | 1 | 0.0 | 0 | 0 | 0 | 0 | 4.454347 | 0.0 | 4.691348 | 0 | 0 |
| 115233 | 1 | 3 | 2 | 2 | 1 | 0 | 0 | 3 | 7 | 3 | 1.386294 | 3.295837 | 3.401197 | 1 | 3 | 2 | 0.0 | 0 | 0 | 0 | 0 | 2.302585 | 0.0 | 5.013963 | 0 | 1 |
| 22831 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 2 | 4 | 8 | 4.343805 | 2.772589 | 1.386294 | 2 | 3 | 2 | 0.0 | 0 | 0 | 0 | 0 | 5.525453 | 0.0 | 4.434382 | 0 | 1 |
# peek at the train/test labels (index alignment sanity check)
y_train.head(), y_test.head()
(81904 1 66526 1 95207 0 8043 1 9332 1 Name: is_canceled, dtype: int64, 20042 0 24451 0 113817 0 115233 0 22831 0 Name: is_canceled, dtype: int64)
# Logistic Regression baseline: fit, predict, report accuracy + confusion
# matrix + per-class metrics on the held-out set.
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Logistic Regression is : 0.8099711992841764
Confusion Matrix :
[[21253 1307]
[ 5489 7714]]
Classification Report :
precision recall f1-score support
0 0.79 0.94 0.86 22560
1 0.86 0.58 0.69 13203
accuracy 0.81 35763
macro avg 0.82 0.76 0.78 35763
weighted avg 0.82 0.81 0.80 35763
# K-Nearest Neighbours. The feature matrices are converted to contiguous
# numpy arrays first, which speeds up the distance computations.
X_train = np.ascontiguousarray(X_train)
X_test = np.ascontiguousarray(X_test)

knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of KNN is : 0.892906076112183
Confusion Matrix :
[[21765 795]
[ 3035 10168]]
Classification Report :
precision recall f1-score support
0 0.88 0.96 0.92 22560
1 0.93 0.77 0.84 13203
accuracy 0.89 35763
macro avg 0.90 0.87 0.88 35763
weighted avg 0.90 0.89 0.89 35763
# Decision Tree: fit, predict, and report held-out metrics.
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Decision Tree is : 0.9502558510192098
Confusion Matrix :
[[21658 902]
[ 877 12326]]
Classification Report :
precision recall f1-score support
0 0.96 0.96 0.96 22560
1 0.93 0.93 0.93 13203
accuracy 0.95 35763
macro avg 0.95 0.95 0.95 35763
weighted avg 0.95 0.95 0.95 35763
# Random Forest: fit, predict, and report held-out metrics.
rd_clf = RandomForestClassifier().fit(X_train, y_train)
y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Random Forest is : 0.9545340156027179
Confusion Matrix :
[[22396 164]
[ 1462 11741]]
Classification Report :
precision recall f1-score support
0 0.94 0.99 0.96 22560
1 0.99 0.89 0.94 13203
accuracy 0.95 35763
macro avg 0.96 0.94 0.95 35763
weighted avg 0.96 0.95 0.95 35763
# AdaBoost with the decision tree as its base learner (the estimator is
# cloned internally, so the fitted `dtc` above is not modified).
# NOTE(review): `base_estimator` is renamed `estimator` in newer sklearn —
# kept verbatim to match the version this notebook ran on.
ada = AdaBoostClassifier(base_estimator = dtc).fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)

acc_ada = accuracy_score(y_test, y_pred_ada)
conf = confusion_matrix(y_test, y_pred_ada)
clf_report = classification_report(y_test, y_pred_ada)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_ada}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9499482705589576
Confusion Matrix :
[[21646 914]
[ 876 12327]]
Classification Report :
precision recall f1-score support
0 0.96 0.96 0.96 22560
1 0.93 0.93 0.93 13203
accuracy 0.95 35763
macro avg 0.95 0.95 0.95 35763
weighted avg 0.95 0.95 0.95 35763
# Gradient Boosting: fit, predict, and report held-out metrics.
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

acc_gb = accuracy_score(y_test, y_pred_gb)
conf = confusion_matrix(y_test, y_pred_gb)
clf_report = classification_report(y_test, y_pred_gb)

# fixed copy-paste bug: the label previously said "Ada Boost Classifier"
print(f"Accuracy Score of Gradient Boosting Classifier is : {acc_gb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9204764700947907
Confusion Matrix :
[[22440 120]
[ 2724 10479]]
Classification Report :
precision recall f1-score support
0 0.89 0.99 0.94 22560
1 0.99 0.79 0.88 13203
accuracy 0.92 35763
macro avg 0.94 0.89 0.91 35763
weighted avg 0.93 0.92 0.92 35763
# XGBoost with lightly tuned hyperparameters: fit, predict, report metrics.
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 180)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
conf = confusion_matrix(y_test, y_pred_xgb)
clf_report = classification_report(y_test, y_pred_xgb)

# fixed copy-paste bug: the label previously said "Ada Boost Classifier"
print(f"Accuracy Score of XGBoost Classifier is : {acc_xgb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9835863881665409
Confusion Matrix :
[[22542 18]
[ 569 12634]]
Classification Report :
precision recall f1-score support
0 0.98 1.00 0.99 22560
1 1.00 0.96 0.98 13203
accuracy 0.98 35763
macro avg 0.99 0.98 0.98 35763
weighted avg 0.98 0.98 0.98 35763
# Extra Trees: fit, predict, and report held-out metrics.
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)
y_pred_etc = etc.predict(X_test)

acc_etc = accuracy_score(y_test, y_pred_etc)
conf = confusion_matrix(y_test, y_pred_etc)
clf_report = classification_report(y_test, y_pred_etc)

# fixed copy-paste bug: the label previously said "Ada Boost Classifier"
print(f"Accuracy Score of Extra Trees Classifier is : {acc_etc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9523529905209295
Confusion Matrix :
[[22346 214]
[ 1490 11713]]
Classification Report :
precision recall f1-score support
0 0.94 0.99 0.96 22560
1 0.98 0.89 0.93 13203
accuracy 0.95 35763
macro avg 0.96 0.94 0.95 35763
weighted avg 0.95 0.95 0.95 35763
# LightGBM with an aggressive learning rate: fit, predict, report metrics.
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)

acc_lgbm = accuracy_score(y_test, y_pred_lgbm)
conf = confusion_matrix(y_test, y_pred_lgbm)
clf_report = classification_report(y_test, y_pred_lgbm)

# fixed copy-paste bug: the label previously said "Ada Boost Classifier"
print(f"Accuracy Score of LGBM Classifier is : {acc_lgbm}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
[LightGBM] [Info] Number of positive: 30996, number of negative: 52451
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1216
[LightGBM] [Info] Number of data points in the train set: 83447, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.371445 -> initscore=-0.526021
[LightGBM] [Info] Start training from score -0.526021
Accuracy Score of Ada Boost Classifier is : 0.949416995218522
Confusion Matrix :
[[21763 797]
[ 1012 12191]]
Classification Report :
precision recall f1-score support
0 0.96 0.96 0.96 22560
1 0.94 0.92 0.93 13203
accuracy 0.95 35763
macro avg 0.95 0.94 0.95 35763
weighted avg 0.95 0.95 0.95 35763
# Hard-voting ensemble over every classifier configured above (each one is
# re-fitted internally on the training data).
classifiers = [
    ('Gradient Boosting Classifier', gb),
    ('XGboost', xgb),
    ('Decision Tree', dtc),
    ('Extra Tree', etc),
    ('Light Gradient', lgbm),
    ('Random Forest', rd_clf),
    ('Ada Boost', ada),
    ('Logistic', lr),
    ('Knn', knn),
]
vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)
[LightGBM] [Info] Number of positive: 30996, number of negative: 52451 [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007571 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 1216 [LightGBM] [Info] Number of data points in the train set: 83447, number of used features: 26 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.371445 -> initscore=-0.526021 [LightGBM] [Info] Start training from score -0.526021
VotingClassifier(estimators=[('Gradient Boosting Classifier',
GradientBoostingClassifier()),
('XGboost',
XGBClassifier(base_score=None, booster='gbtree',
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_polic...
num_parallel_tree=None,
predictor=None, random_state=None, ...)),
('Decision Tree', DecisionTreeClassifier()),
('Extra Tree', ExtraTreesClassifier()),
('Light Gradient',
LGBMClassifier(learning_rate=1)),
('Random Forest', RandomForestClassifier()),
('Ada Boost',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
('Logistic', LogisticRegression()),
('Knn', KNeighborsClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. VotingClassifier(estimators=[('Gradient Boosting Classifier',
GradientBoostingClassifier()),
('XGboost',
XGBClassifier(base_score=None, booster='gbtree',
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_polic...
num_parallel_tree=None,
predictor=None, random_state=None, ...)),
('Decision Tree', DecisionTreeClassifier()),
('Extra Tree', ExtraTreesClassifier()),
('Light Gradient',
LGBMClassifier(learning_rate=1)),
('Random Forest', RandomForestClassifier()),
('Ada Boost',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
('Logistic', LogisticRegression()),
('Knn', KNeighborsClassifier())])GradientBoostingClassifier()
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=5, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=180, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)DecisionTreeClassifier()
ExtraTreesClassifier()
LGBMClassifier(learning_rate=1)
RandomForestClassifier()
DecisionTreeClassifier()
DecisionTreeClassifier()
LogisticRegression()
KNeighborsClassifier()
# Evaluate the voting ensemble on the held-out set.
y_pred_vc = vc.predict(X_test)

acc_vtc = accuracy_score(y_test, y_pred_vc)
conf = confusion_matrix(y_test, y_pred_vc)
clf_report = classification_report(y_test, y_pred_vc)

# fixed copy-paste bug: the label previously said "Ada Boost Classifier"
print(f"Accuracy Score of Voting Classifier is : {acc_vtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9652434079914995
Confusion Matrix :
[[22530 30]
[ 1213 11990]]
Classification Report :
precision recall f1-score support
0 0.95 1.00 0.97 22560
1 1.00 0.91 0.95 13203
accuracy 0.97 35763
macro avg 0.97 0.95 0.96 35763
weighted avg 0.97 0.97 0.96 35763
from tensorflow.keras.utils import to_categorical

# Rebuild the design matrix and one-hot encode the target for the neural
# network below (shape becomes (n_samples, 2)).
X = pd.concat([cat_df, num_df], axis = 1)
y = to_categorical(df['is_canceled'])

# splitting data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
import keras
from keras.layers import Dense
from keras.models import Sequential

# Simple feed-forward classifier over the 26 engineered features.
model = Sequential()
model.add(Dense(100, activation = 'relu', input_shape = (26, )))
model.add(Dense(100, activation = 'relu'))
# Fixed: softmax + categorical_crossentropy instead of sigmoid +
# binary_crossentropy. The labels are one-hot over two mutually exclusive
# classes, so the two outputs should form a probability distribution; with
# independent sigmoids Keras also reports element-wise binary accuracy
# rather than true class accuracy.
model.add(Dense(2, activation = 'softmax'))
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model_history = model.fit(X_train, y_train, validation_data = (X_test, y_test),
                          epochs = 100)
Epoch 1/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.3595 - accuracy: 0.8476 - val_loss: 0.2459 - val_accuracy: 0.9108 Epoch 2/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.1780 - accuracy: 0.9387 - val_loss: 0.1419 - val_accuracy: 0.9515 Epoch 3/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.1277 - accuracy: 0.9583 - val_loss: 0.1103 - val_accuracy: 0.9634 Epoch 4/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.1046 - accuracy: 0.9657 - val_loss: 0.0831 - val_accuracy: 0.9748 Epoch 5/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0914 - accuracy: 0.9704 - val_loss: 0.0819 - val_accuracy: 0.9767 Epoch 6/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0801 - accuracy: 0.9748 - val_loss: 0.0772 - val_accuracy: 0.9779 Epoch 7/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0733 - accuracy: 0.9769 - val_loss: 0.0711 - val_accuracy: 0.9771 Epoch 8/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0672 - accuracy: 0.9787 - val_loss: 0.0631 - val_accuracy: 0.9824 Epoch 9/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0633 - accuracy: 0.9801 - val_loss: 0.0795 - val_accuracy: 0.9760 Epoch 10/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0584 - accuracy: 0.9819 - val_loss: 0.0568 - val_accuracy: 0.9837 Epoch 11/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0560 - accuracy: 0.9822 - val_loss: 0.0529 - val_accuracy: 0.9839 Epoch 12/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0531 - accuracy: 0.9828 - val_loss: 0.0456 - val_accuracy: 0.9875 Epoch 13/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0517 - accuracy: 0.9839 - val_loss: 0.0613 - val_accuracy: 0.9805 Epoch 14/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0518 - accuracy: 
0.9834 - val_loss: 0.0491 - val_accuracy: 0.9874 Epoch 15/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0468 - accuracy: 0.9849 - val_loss: 0.0476 - val_accuracy: 0.9859 Epoch 16/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0470 - accuracy: 0.9849 - val_loss: 0.0538 - val_accuracy: 0.9829 Epoch 17/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0444 - accuracy: 0.9861 - val_loss: 0.0732 - val_accuracy: 0.9803 Epoch 18/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0428 - accuracy: 0.9861 - val_loss: 0.0414 - val_accuracy: 0.9882 Epoch 19/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0440 - accuracy: 0.9860 - val_loss: 0.0365 - val_accuracy: 0.9906 Epoch 20/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0419 - accuracy: 0.9868 - val_loss: 0.0520 - val_accuracy: 0.9846 Epoch 21/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0406 - accuracy: 0.9872 - val_loss: 0.0583 - val_accuracy: 0.9823 Epoch 22/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0395 - accuracy: 0.9876 - val_loss: 0.0604 - val_accuracy: 0.9817 Epoch 23/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0390 - accuracy: 0.9878 - val_loss: 0.0429 - val_accuracy: 0.9874 Epoch 24/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0391 - accuracy: 0.9872 - val_loss: 0.0356 - val_accuracy: 0.9902 Epoch 25/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0381 - accuracy: 0.9879 - val_loss: 0.0446 - val_accuracy: 0.9873 Epoch 26/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0351 - accuracy: 0.9890 - val_loss: 0.0320 - val_accuracy: 0.9904 Epoch 27/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0368 - accuracy: 0.9884 - val_loss: 0.0451 - val_accuracy: 0.9867 Epoch 28/100 2608/2608 
[==============================] - 12s 5ms/step - loss: 0.0374 - accuracy: 0.9882 - val_loss: 0.0335 - val_accuracy: 0.9908 Epoch 29/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0335 - accuracy: 0.9894 - val_loss: 0.0378 - val_accuracy: 0.9891 Epoch 30/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0329 - accuracy: 0.9897 - val_loss: 0.0399 - val_accuracy: 0.9889 Epoch 31/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0346 - accuracy: 0.9894 - val_loss: 0.0361 - val_accuracy: 0.9890 Epoch 32/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0321 - accuracy: 0.9900 - val_loss: 0.0338 - val_accuracy: 0.9897 Epoch 33/100 2608/2608 [==============================] - 15s 6ms/step - loss: 0.0318 - accuracy: 0.9896 - val_loss: 0.0348 - val_accuracy: 0.9907 Epoch 34/100 2608/2608 [==============================] - 15s 6ms/step - loss: 0.0343 - accuracy: 0.9889 - val_loss: 0.0779 - val_accuracy: 0.9767 Epoch 35/100 2608/2608 [==============================] - 15s 6ms/step - loss: 0.0297 - accuracy: 0.9907 - val_loss: 0.0434 - val_accuracy: 0.9878 Epoch 36/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0303 - accuracy: 0.9902 - val_loss: 0.0368 - val_accuracy: 0.9905 Epoch 37/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0300 - accuracy: 0.9906 - val_loss: 0.0408 - val_accuracy: 0.9898 Epoch 38/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0296 - accuracy: 0.9904 - val_loss: 0.0616 - val_accuracy: 0.9832 Epoch 39/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0309 - accuracy: 0.9903 - val_loss: 0.0369 - val_accuracy: 0.9912 Epoch 40/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0296 - accuracy: 0.9904 - val_loss: 0.0473 - val_accuracy: 0.9878 Epoch 41/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0325 - accuracy: 0.9900 - 
val_loss: 0.0303 - val_accuracy: 0.9916 Epoch 42/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0297 - accuracy: 0.9907 - val_loss: 0.0637 - val_accuracy: 0.9836 Epoch 43/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0288 - accuracy: 0.9910 - val_loss: 0.0364 - val_accuracy: 0.9892 Epoch 44/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0264 - accuracy: 0.9912 - val_loss: 0.0350 - val_accuracy: 0.9897 Epoch 45/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0283 - accuracy: 0.9910 - val_loss: 0.0374 - val_accuracy: 0.9886 Epoch 46/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0291 - accuracy: 0.9912 - val_loss: 0.0534 - val_accuracy: 0.9854 Epoch 47/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0263 - accuracy: 0.9915 - val_loss: 0.0316 - val_accuracy: 0.9907 Epoch 48/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0280 - accuracy: 0.9913 - val_loss: 0.0384 - val_accuracy: 0.9894 Epoch 49/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0259 - accuracy: 0.9914 - val_loss: 0.0400 - val_accuracy: 0.9888 Epoch 50/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0277 - accuracy: 0.9910 - val_loss: 0.0425 - val_accuracy: 0.9884 Epoch 51/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0277 - accuracy: 0.9916 - val_loss: 0.0290 - val_accuracy: 0.9912 Epoch 52/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0244 - accuracy: 0.9922 - val_loss: 0.0521 - val_accuracy: 0.9874 Epoch 53/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0268 - accuracy: 0.9913 - val_loss: 0.0257 - val_accuracy: 0.9925 Epoch 54/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0262 - accuracy: 0.9916 - val_loss: 0.0487 - val_accuracy: 0.9876 Epoch 55/100 2608/2608 
[==============================] - 11s 4ms/step - loss: 0.0259 - accuracy: 0.9917 - val_loss: 0.0332 - val_accuracy: 0.9897 Epoch 56/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0240 - accuracy: 0.9922 - val_loss: 0.0325 - val_accuracy: 0.9909 Epoch 57/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0256 - accuracy: 0.9914 - val_loss: 0.0364 - val_accuracy: 0.9898 Epoch 58/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0252 - accuracy: 0.9924 - val_loss: 0.0329 - val_accuracy: 0.9907 Epoch 59/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0254 - accuracy: 0.9920 - val_loss: 0.0789 - val_accuracy: 0.9861 Epoch 60/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0237 - accuracy: 0.9926 - val_loss: 0.0432 - val_accuracy: 0.9887 Epoch 61/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0239 - accuracy: 0.9922 - val_loss: 0.0275 - val_accuracy: 0.9922 Epoch 62/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0226 - accuracy: 0.9927 - val_loss: 0.0472 - val_accuracy: 0.9892 Epoch 63/100 2608/2608 [==============================] - 11s 4ms/step - loss: 0.0222 - accuracy: 0.9927 - val_loss: 0.0302 - val_accuracy: 0.9914 Epoch 64/100 2608/2608 [==============================] - 12s 4ms/step - loss: 0.0242 - accuracy: 0.9919 - val_loss: 0.0349 - val_accuracy: 0.9907 Epoch 65/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0238 - accuracy: 0.9924 - val_loss: 0.0312 - val_accuracy: 0.9917 Epoch 66/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0220 - accuracy: 0.9928 - val_loss: 0.0350 - val_accuracy: 0.9901 Epoch 67/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0247 - accuracy: 0.9919 - val_loss: 0.0398 - val_accuracy: 0.9905 Epoch 68/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0214 - accuracy: 0.9932 - 
val_loss: 0.0397 - val_accuracy: 0.9897 Epoch 69/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0221 - accuracy: 0.9929 - val_loss: 0.0346 - val_accuracy: 0.9908 Epoch 70/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0233 - accuracy: 0.9928 - val_loss: 0.0336 - val_accuracy: 0.9910 Epoch 71/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0221 - accuracy: 0.9931 - val_loss: 0.0376 - val_accuracy: 0.9902 Epoch 72/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0213 - accuracy: 0.9931 - val_loss: 0.0384 - val_accuracy: 0.9914 Epoch 73/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0223 - accuracy: 0.9928 - val_loss: 0.0562 - val_accuracy: 0.9808 Epoch 74/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0212 - accuracy: 0.9929 - val_loss: 0.0392 - val_accuracy: 0.9876 Epoch 75/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0229 - accuracy: 0.9924 - val_loss: 0.0337 - val_accuracy: 0.9897 Epoch 76/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9933 - val_loss: 0.0476 - val_accuracy: 0.9891 Epoch 77/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0203 - accuracy: 0.9933 - val_loss: 0.0347 - val_accuracy: 0.9916 Epoch 78/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9930 - val_loss: 0.0302 - val_accuracy: 0.9919 Epoch 79/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0205 - accuracy: 0.9933 - val_loss: 0.0360 - val_accuracy: 0.9909 Epoch 80/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0205 - accuracy: 0.9935 - val_loss: 0.0280 - val_accuracy: 0.9925 Epoch 81/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0203 - accuracy: 0.9933 - val_loss: 0.0326 - val_accuracy: 0.9917 Epoch 82/100 2608/2608 
[==============================] - 13s 5ms/step - loss: 0.0199 - accuracy: 0.9936 - val_loss: 0.0434 - val_accuracy: 0.9893 Epoch 83/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0194 - accuracy: 0.9938 - val_loss: 0.0388 - val_accuracy: 0.9910 Epoch 84/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0184 - accuracy: 0.9942 - val_loss: 0.0298 - val_accuracy: 0.9918 Epoch 85/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0212 - accuracy: 0.9930 - val_loss: 0.0404 - val_accuracy: 0.9893 Epoch 86/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0213 - accuracy: 0.9930 - val_loss: 0.0285 - val_accuracy: 0.9926 Epoch 87/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0204 - accuracy: 0.9935 - val_loss: 0.0318 - val_accuracy: 0.9927 Epoch 88/100 2608/2608 [==============================] - 12s 5ms/step - loss: 0.0186 - accuracy: 0.9941 - val_loss: 0.0331 - val_accuracy: 0.9920 Epoch 89/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9935 - val_loss: 0.0353 - val_accuracy: 0.9904 Epoch 90/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0193 - accuracy: 0.9936 - val_loss: 0.0427 - val_accuracy: 0.9883 Epoch 91/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0180 - accuracy: 0.9940 - val_loss: 0.0390 - val_accuracy: 0.9911 Epoch 92/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0191 - accuracy: 0.9938 - val_loss: 0.0324 - val_accuracy: 0.9912 Epoch 93/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0173 - accuracy: 0.9942 - val_loss: 0.0498 - val_accuracy: 0.9862 Epoch 94/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0180 - accuracy: 0.9941 - val_loss: 0.0505 - val_accuracy: 0.9867 Epoch 95/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0179 - accuracy: 0.9941 - 
val_loss: 0.0710 - val_accuracy: 0.9863 Epoch 96/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0188 - accuracy: 0.9939 - val_loss: 0.0368 - val_accuracy: 0.9917 Epoch 97/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0172 - accuracy: 0.9943 - val_loss: 0.0353 - val_accuracy: 0.9918 Epoch 98/100 2608/2608 [==============================] - 15s 6ms/step - loss: 0.0189 - accuracy: 0.9936 - val_loss: 0.0377 - val_accuracy: 0.9920 Epoch 99/100 2608/2608 [==============================] - 14s 5ms/step - loss: 0.0170 - accuracy: 0.9941 - val_loss: 0.0306 - val_accuracy: 0.9921 Epoch 100/100 2608/2608 [==============================] - 13s 5ms/step - loss: 0.0185 - accuracy: 0.9941 - val_loss: 0.0370 - val_accuracy: 0.9923
# Plot training vs. validation loss per epoch with plotly express.
# NOTE: the previous plt.figure(figsize=(12, 6)) call was removed — the chart
# is rendered by plotly, so the matplotlib figure was never drawn and only
# produced an empty "<Figure ... with 0 Axes>" output.
train_loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
# Derive the epoch axis from the history itself instead of hard-coding 100,
# so the plot stays correct if training stops early (e.g. EarlyStopping).
epoch = range(1, len(train_loss) + 1)
loss = pd.DataFrame({'train_loss' : train_loss, 'val_loss' : val_loss})
px.line(data_frame = loss, x = epoch, y = ['val_loss', 'train_loss'], title = 'Training and Validation Loss',
template = 'plotly_dark')
<Figure size 1200x600 with 0 Axes>
# Plot training vs. validation accuracy per epoch with plotly express.
# NOTE: the previous plt.figure(figsize=(12, 6)) call was removed — the chart
# is rendered by plotly, so the matplotlib figure was never drawn and only
# produced an empty "<Figure ... with 0 Axes>" output.
train_acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy']
# Derive the epoch axis from the history itself instead of hard-coding 100,
# so the plot stays correct if training stops early (e.g. EarlyStopping).
epoch = range(1, len(train_acc) + 1)
accuracy = pd.DataFrame({'train_acc' : train_acc, 'val_acc' : val_acc})
px.line(data_frame = accuracy, x = epoch, y = ['val_acc', 'train_acc'], title = 'Training and Validation Accuracy',
template = 'plotly_dark')
<Figure size 1200x600 with 0 Axes>
# Evaluate the trained ANN on the held-out test set.
# Keras evaluate() returns [loss, accuracy]; keep only the accuracy so it can
# be compared against the classical models below.
eval_metrics = model.evaluate(X_test, y_test)
acc_ann = eval_metrics[1]
print('Accuracy of model is {}'.format(acc_ann))
1118/1118 [==============================] - 3s 3ms/step - loss: 0.0370 - accuracy: 0.9923 Accuracy of model is 0.9923384785652161
# Collect every model's test accuracy into one table and rank best-first.
model_names = [
    'Logistic Regression',
    'KNN',
    'Decision Tree Classifier',
    'Random Forest Classifier',
    'Ada Boost Classifier',
    'Gradient Boosting Classifier',
    'XgBoost',
    'Extra Trees Classifier',
    'LGBM',
    'Voting Classifier',
    'ANN',
]
model_scores = [acc_lr, acc_knn, acc_dtc, acc_rd_clf, acc_ada, acc_gb,
                acc_xgb, acc_etc, acc_lgbm, acc_vtc, acc_ann]
models = pd.DataFrame({'Model' : model_names, 'Score' : model_scores})
# Displayed (not reassigned) so the notebook shows the ranked leaderboard.
models.sort_values(by = 'Score', ascending = False)
| Model | Score | |
|---|---|---|
| 10 | ANN | 0.992338 |
| 6 | XgBoost | 0.983586 |
| 9 | Voting Classifier | 0.965243 |
| 3 | Random Forest Classifier | 0.954534 |
| 7 | Extra Trees Classifier | 0.952353 |
| 2 | Decision Tree Classifier | 0.950256 |
| 4 | Ada Boost Classifier | 0.949948 |
| 8 | LGBM | 0.949417 |
| 5 | Gradient Boosting Classifier | 0.920476 |
| 1 | KNN | 0.892906 |
| 0 | Logistic Regression | 0.809971 |
# Horizontal bar chart of the leaderboard: one bar per model, shaded by score.
px.bar(
    data_frame = models,
    y = 'Model',
    x = 'Score',
    color = 'Score',
    title = 'Models Comparison',
    template = 'plotly_dark',
)